"""
Phase 6: Additional analyses for referee response.

Addresses the tension between parametric significance (Z1 x resource = +0.154***)
and bootstrap CI that includes zero. Four targeted tests:

  1. Drop Norway from OECD split (Norway = SWF outlier)
  2. KAOPEN interaction with commodity status (capital openness channel)
  3. Persistent commodity exporter definition (tighter classification)
  4. SWF era split (post-2000 weakening hypothesis)
"""

import sys
import numpy as np
import pandas as pd
from pathlib import Path

sys.path.insert(0, '/mnt/c/demographics_capital_flows/multilateral/src')
from model import PanelGLS

# ── Paths ─────────────────────────────────────────────────────────────────
DATA_PATH = Path('/mnt/c/demographics_capital_flows/multilateral/140_country/data/processed/full_panel_with_resources.csv')
OUTPUT_DIR = Path('/mnt/c/demographics_capital_flows/commodity_demographics/output/tables')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ── Variable definitions ──────────────────────────────────────────────────
CONTROLS = ['fiscal_bal_gdp', 'nfa_gdp_lag', 'rgdp_growth', 'log_rel_opw', 'kaopen']
BASE_VARS = ['Z_1', 'Z_2', 'Z_3'] + CONTROLS
CORE_VARS = BASE_VARS + ['resource_rents_gdp', 'Z1_x_resource']
INTERACTION_VAR = 'Z1_x_resource'

OECD = {
    'AUS', 'AUT', 'BEL', 'CAN', 'CHL', 'COL', 'CRI', 'CZE', 'DNK', 'EST',
    'FIN', 'FRA', 'DEU', 'GRC', 'HUN', 'ISL', 'IRL', 'ISR', 'ITA', 'JPN',
    'KOR', 'LVA', 'LTU', 'LUX', 'MEX', 'NLD', 'NZL', 'NOR', 'POL', 'PRT',
    'SVK', 'SVN', 'ESP', 'SWE', 'CHE', 'TUR', 'GBR', 'USA',
}


# ══════════════════════════════════════════════════════════════════════════
# Helper functions
# ══════════════════════════════════════════════════════════════════════════

def load_data():
    """Load and filter the panel to year <= 2024."""
    df = pd.read_csv(DATA_PATH)
    df = df[df['year'] <= 2024].copy()
    df['Z1_x_resource'] = df['Z_1'] * df['resource_rents_gdp']
    return df


def prepare(df, vars_needed=None):
    """Drop rows with NaN in required columns."""
    if vars_needed is None:
        vars_needed = CORE_VARS
    sub = df.dropna(subset=['ca_gdp'] + vars_needed).copy()
    return sub


def run_model(df, var_list):
    """Run PanelGLS on given variables, return model."""
    sub = prepare(df, vars_needed=var_list)
    gls = PanelGLS()
    gls.fit(sub['ca_gdp'].values, sub[var_list].values,
            sub['iso3'].values, sub['year'].values)
    return gls, sub


def extract_coef(model, var_list, var_name):
    """Extract coefficient, SE, p for a named variable."""
    idx = var_list.index(var_name)
    return model.beta[idx], model.se[idx], model.pvalues[idx]


def fmt_row(test, spec, model, var_list, var_name, notes=''):
    """Build a results row dict."""
    beta, se, p = extract_coef(model, var_list, var_name)
    return {
        'test': test,
        'specification': spec,
        'n_obs': model.n_obs,
        'r_squared': round(model.r_squared, 4),
        'coef_name': var_name,
        'beta': round(beta, 4),
        'se': round(se, 4),
        'p': round(p, 4),
        'notes': notes,
    }


# ══════════════════════════════════════════════════════════════════════════
# 1. Drop Norway from OECD split
# ══════════════════════════════════════════════════════════════════════════

def test_oecd_drop_norway(df):
    """OECD with and without Norway — is NOR driving the interaction?"""
    print("\n" + "=" * 70)
    print("TEST 1: DROP NORWAY FROM OECD SPLIT")
    print("=" * 70)
    rows = []

    # 1a. Full OECD
    oecd_df = df[df['iso3'].isin(OECD)].copy()
    model, sub = run_model(oecd_df, CORE_VARS)
    beta, se, p = extract_coef(model, CORE_VARS, INTERACTION_VAR)
    n_countries = sub['iso3'].nunique()
    print(f"\n  OECD (all {n_countries} countries):")
    print(f"    N={model.n_obs}, R²={model.r_squared:.4f}")
    print(f"    Z₁×Resource = {beta:.4f} (SE={se:.4f}, p={p:.4f})")
    rows.append(fmt_row('1. OECD Split', f'OECD (all, {n_countries} countries)',
                         model, CORE_VARS, INTERACTION_VAR))

    # 1b. OECD excluding Norway
    oecd_no_nor = df[df['iso3'].isin(OECD - {'NOR'})].copy()
    model2, sub2 = run_model(oecd_no_nor, CORE_VARS)
    beta2, se2, p2 = extract_coef(model2, CORE_VARS, INTERACTION_VAR)
    n_countries2 = sub2['iso3'].nunique()
    print(f"\n  OECD excluding Norway ({n_countries2} countries):")
    print(f"    N={model2.n_obs}, R²={model2.r_squared:.4f}")
    print(f"    Z₁×Resource = {beta2:.4f} (SE={se2:.4f}, p={p2:.4f})")

    change = ((beta2 - beta) / abs(beta) * 100) if beta != 0 else np.nan
    note = f'Excl NOR; coef change: {change:+.1f}%'
    rows.append(fmt_row('1. OECD Split', f'OECD excl Norway ({n_countries2} countries)',
                         model2, CORE_VARS, INTERACTION_VAR, notes=note))

    print(f"\n  => Coefficient change from dropping Norway: {change:+.1f}%")
    return rows


# ══════════════════════════════════════════════════════════════════════════
# 2. KAOPEN interaction with commodity status
# ══════════════════════════════════════════════════════════════════════════

def test_kaopen_interaction(df):
    """Does capital account openness weaken commodity-demographic amplification?"""
    print("\n" + "=" * 70)
    print("TEST 2: KAOPEN × COMMODITY INTERACTION")
    print("=" * 70)
    rows = []

    # Create interaction terms
    df = df.copy()
    df['Z1_x_kaopen'] = df['Z_1'] * df['kaopen']
    df['resource_x_kaopen'] = df['resource_rents_gdp'] * df['kaopen']
    df['Z1_x_resource_x_kaopen'] = df['Z_1'] * df['resource_rents_gdp'] * df['kaopen']

    # 2a. Full model with triple interaction
    triple_vars = CORE_VARS + ['Z1_x_kaopen', 'resource_x_kaopen', 'Z1_x_resource_x_kaopen']
    model, sub = run_model(df, triple_vars)

    print("\n  Full model with KAOPEN triple interaction:")
    print(f"    N={model.n_obs}, R²={model.r_squared:.4f}")
    for vname in ['Z1_x_resource', 'Z1_x_kaopen', 'resource_x_kaopen', 'Z1_x_resource_x_kaopen']:
        b, s, p = extract_coef(model, triple_vars, vname)
        sig = '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.1 else ''
        print(f"    {vname}: {b:.4f} (SE={s:.4f}, p={p:.4f}){sig}")
        rows.append(fmt_row('2. KAOPEN Interaction', 'Triple interaction model',
                             model, triple_vars, vname))

    # 2b. Split by KAOPEN median
    kaopen_median = df['kaopen'].median()
    print(f"\n  KAOPEN median: {kaopen_median:.3f}")

    for label, mask in [
        ('Low KAOPEN (closed)', df['kaopen'] <= kaopen_median),
        ('High KAOPEN (open)', df['kaopen'] > kaopen_median),
    ]:
        sub_df = df[mask].copy()
        model_s, sub_s = run_model(sub_df, CORE_VARS)
        beta, se, p = extract_coef(model_s, CORE_VARS, INTERACTION_VAR)
        n_c = sub_s['iso3'].nunique()
        sig = '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.1 else ''
        print(f"\n  {label} ({n_c} countries):")
        print(f"    N={model_s.n_obs}, R²={model_s.r_squared:.4f}")
        print(f"    Z₁×Resource = {beta:.4f} (SE={se:.4f}, p={p:.4f}){sig}")
        rows.append(fmt_row('2. KAOPEN Interaction', f'KAOPEN split: {label}',
                             model_s, CORE_VARS, INTERACTION_VAR,
                             notes=f'{n_c} countries, median={kaopen_median:.3f}'))

    return rows


# ══════════════════════════════════════════════════════════════════════════
# 3. Persistent commodity exporter definition
# ══════════════════════════════════════════════════════════════════════════

def test_persistent_exporter(df):
    """Tighter definition: countries that are commodity exporters for >= N years."""
    print("\n" + "=" * 70)
    print("TEST 3: PERSISTENT COMMODITY EXPORTER DEFINITION")
    print("=" * 70)
    rows = []

    df = df.copy()

    # Count years with resource_rents_gdp >= 10% for each country
    rents_flag = df.dropna(subset=['resource_rents_gdp']).copy()
    rents_flag['is_high'] = (rents_flag['resource_rents_gdp'] >= 10).astype(int)
    years_high = rents_flag.groupby('iso3')['is_high'].sum().reset_index()
    years_high.columns = ['iso3', 'n_years_high']

    # Also count total years with data
    total_years = rents_flag.groupby('iso3').size().reset_index(name='n_years_total')
    years_high = years_high.merge(total_years, on='iso3')

    # "Any year" definition — how many countries have at least 1 year >= 10%
    any_year = set(years_high[years_high['n_years_high'] >= 1]['iso3'])

    # Persistent definitions
    for threshold_years, label in [(5, '>= 5 years'), (10, '>= 10 years')]:
        persistent = set(years_high[years_high['n_years_high'] >= threshold_years]['iso3'])
        print(f"\n  Persistent commodity exporter ({label}):")
        print(f"    {len(persistent)} countries qualify (vs {len(any_year)} with any year >= 10%)")
        print(f"    Countries: {sorted(persistent)}")

        df[f'persistent_{threshold_years}'] = df['iso3'].isin(persistent).astype(float)
        df[f'Z1_x_persistent_{threshold_years}'] = df['Z_1'] * df[f'persistent_{threshold_years}']

        var_list = BASE_VARS + [f'persistent_{threshold_years}', f'Z1_x_persistent_{threshold_years}']
        model, sub = run_model(df, var_list)
        var_name = f'Z1_x_persistent_{threshold_years}'
        beta, se, p = extract_coef(model, var_list, var_name)
        sig = '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.1 else ''
        print(f"    N={model.n_obs}, R²={model.r_squared:.4f}")
        print(f"    Z₁×Persistent = {beta:.4f} (SE={se:.4f}, p={p:.4f}){sig}")

        rows.append({
            'test': '3. Persistent Exporter',
            'specification': f'Persistent ({label})',
            'n_obs': model.n_obs,
            'r_squared': round(model.r_squared, 4),
            'coef_name': var_name,
            'beta': round(beta, 4),
            'se': round(se, 4),
            'p': round(p, 4),
            'notes': f'{len(persistent)} countries qualify; vs {len(any_year)} any-year',
        })

    # Also run continuous interaction for comparison
    print(f"\n  Continuous interaction (baseline for comparison):")
    model_c, _ = run_model(df, CORE_VARS)
    beta_c, se_c, p_c = extract_coef(model_c, CORE_VARS, INTERACTION_VAR)
    sig = '***' if p_c < 0.01 else '**' if p_c < 0.05 else '*' if p_c < 0.1 else ''
    print(f"    Z₁×Resource (continuous) = {beta_c:.4f} (SE={se_c:.4f}, p={p_c:.4f}){sig}")
    rows.append(fmt_row('3. Persistent Exporter', 'Continuous Z1×Resource (comparison)',
                         model_c, CORE_VARS, INTERACTION_VAR,
                         notes='Baseline continuous interaction'))

    return rows


# ══════════════════════════════════════════════════════════════════════════
# 4. SWF era split
# ══════════════════════════════════════════════════════════════════════════

def test_swf_era(df):
    """Test whether the interaction weakened post-2000 (SWF institutionalization)."""
    print("\n" + "=" * 70)
    print("TEST 4: SWF ERA SPLIT (POST-2000)")
    print("=" * 70)
    rows = []

    df = df.copy()
    df['swf_era'] = (df['year'] >= 2000).astype(float)
    df['Z1_x_resource_x_swf'] = df['Z_1'] * df['resource_rents_gdp'] * df['swf_era']
    df['resource_x_swf'] = df['resource_rents_gdp'] * df['swf_era']
    df['Z1_x_swf'] = df['Z_1'] * df['swf_era']

    # 4a. Triple interaction model
    triple_vars = CORE_VARS + ['Z1_x_swf', 'resource_x_swf', 'Z1_x_resource_x_swf']
    model, sub = run_model(df, triple_vars)

    print("\n  Triple interaction model (SWF era dummy = year >= 2000):")
    print(f"    N={model.n_obs}, R²={model.r_squared:.4f}")
    for vname in ['Z1_x_resource', 'Z1_x_swf', 'resource_x_swf', 'Z1_x_resource_x_swf']:
        b, s, p = extract_coef(model, triple_vars, vname)
        sig = '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.1 else ''
        print(f"    {vname}: {b:.4f} (SE={s:.4f}, p={p:.4f}){sig}")
        rows.append(fmt_row('4. SWF Era', 'Triple interaction (SWF era)',
                             model, triple_vars, vname))

    # 4b. Period splits
    for label, mask in [
        ('Pre-2000 (pre-SWF)', df['year'] < 2000),
        ('Post-2000 (SWF era)', df['year'] >= 2000),
    ]:
        sub_df = df[mask].copy()
        model_s, sub_s = run_model(sub_df, CORE_VARS)
        beta, se, p = extract_coef(model_s, CORE_VARS, INTERACTION_VAR)
        n_c = sub_s['iso3'].nunique()
        sig = '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.1 else ''
        print(f"\n  {label} ({n_c} countries):")
        print(f"    N={model_s.n_obs}, R²={model_s.r_squared:.4f}")
        print(f"    Z₁×Resource = {beta:.4f} (SE={se:.4f}, p={p:.4f}){sig}")
        rows.append(fmt_row('4. SWF Era', f'Period split: {label}',
                             model_s, CORE_VARS, INTERACTION_VAR,
                             notes=f'{n_c} countries'))

    return rows


# ══════════════════════════════════════════════════════════════════════════
# Main
# ══════════════════════════════════════════════════════════════════════════

def main():
    print("=" * 70)
    print("COMMODITY DEMOGRAPHICS — PHASE 6: REFEREE REVISION ANALYSES")
    print("=" * 70)

    df = load_data()
    all_rows = []

    # 1. Drop Norway from OECD
    all_rows.extend(test_oecd_drop_norway(df))

    # 2. KAOPEN interaction
    all_rows.extend(test_kaopen_interaction(df))

    # 3. Persistent exporter definition
    all_rows.extend(test_persistent_exporter(df))

    # 4. SWF era split
    all_rows.extend(test_swf_era(df))

    # ── Save ──────────────────────────────────────────────────────────────
    results = pd.DataFrame(all_rows)
    out_path = OUTPUT_DIR / 'table_revisions.csv'
    results.to_csv(out_path, index=False)
    print(f"\n\nSaved: {out_path}")

    # ── Summary ───────────────────────────────────────────────────────────
    print("\n" + "=" * 70)
    print("FULL RESULTS TABLE")
    print("=" * 70)
    pd.set_option('display.width', 160)
    pd.set_option('display.max_colwidth', 50)
    pd.set_option('display.max_rows', 100)
    print(results.to_string(index=False))


if __name__ == '__main__':
    main()
